library(survival)
library(KMsurv)
library(ggplot2)
library(ggpubr)
library(survminer)
library(plotly)
library(muhaz)
library(ggthemes)
vars <- c("id", "clinic", "status", "time", "prison", "methadone")
addicts <-
  read.table(
    url(
      "https://dmrocke.ucdavis.edu/Class/BST222.2022.Fall/addicts.txt"
    ),
    header = F,
    col.names = vars
  )
#addicts

Question 1

#change variables to factors to be used in Cox PH
addicts$clinic <- factor(addicts$clinic,labels=c("Clinic1","Clinic2"))
addicts$prison <- factor(addicts$prison,labels=c("No","Yes"))

head(addicts)
##   id  clinic status time prison methadone
## 1  1 Clinic1      1  428     No        50
## 2  2 Clinic1      1  275    Yes        55
## 3  3 Clinic1      1  262     No        55
## 4  4 Clinic1      1  183     No        30
## 5  5 Clinic1      1  259    Yes        65
## 6  6 Clinic1      1  714     No        55
surv_object <- Surv(time = addicts$time, event = addicts$status)
addicts.cox <- coxph(surv_object ~ clinic + prison + methadone, data = addicts)
#summary(addicts.cox)

addicts.zph <- cox.zph(addicts.cox)
addicts.zph
##            chisq df       p
## clinic    11.159  1 0.00084
## prison     0.839  1 0.35969
## methadone  0.515  1 0.47310
## GLOBAL    12.287  3 0.00646
#For clinic
ggcoxzph(addicts.zph[1], se = FALSE, font.main = 12, ggtheme = theme_solarized(), point.col = "#0096FF", ylim = c(-7,7))

#For prison
ggcoxzph(addicts.zph[2], se = FALSE, font.main = 12, ggtheme = theme_solarized(), point.col = "#0096FF", ylim = c(-4,4))

#For methadone/
ggcoxzph(addicts.zph[3], se = FALSE, font.main = 12, ggtheme = theme_solarized(), point.col = "#0096FF", ylim = c(-0.5,0.5))

When we look at the plots of our Schoenfield Residuals, as well as our p-values, we can see that both methadone and prison variables are significantly proportional while our clinic variable violates our proportinality assumption at any reasonable \(\alpha\).

Question 2

addicts.mart <- residuals(addicts.cox, type = "martingale")

addicts.cs <- addicts$status - addicts.mart

#Cumulative hazard 
surv.csr <- survfit(Surv(addicts.cs, addicts$status) ~1, type = "fleming-harrington", data = addicts)

#plot(surv.csr, fun ="cumhaz")


cumHazPlot <-
  ggsurvplot(
    surv.csr,
    fun = "cumhaz",
    conf.int = TRUE,
    palette = c("#581845"),
    ggtheme = theme_solarized()
  ) + ggtitle("Cumulative Hazard for Cox-Snell Residuals")
#ggplotly(cumHazPlot[[1]])


ggplotly(cumHazPlot$plot + geom_abline())

We can see that since our Cox-Snell residuals curve fits a standard line quite well, we can say we do not observe a lack of fit.

Question 3

#This time, without methadone
addicts.coxNoMeth <- coxph(surv_object ~ clinic + prison, data = addicts)
addicts.mart <- residuals(addicts.cox, type = "martingale")


lowessOBJ <- as.data.frame(lowess(addicts$methadone, addicts.mart))

ggplotly(
  ggplot() + aes(addicts$methadone, addicts.mart) + geom_point(col = "#FFA000") + labs(x = "Methadone", y = "Martingale Residuals", title = "Martingale Residuals vs. Methadone Dosage") + geom_line(data = lowessOBJ, aes(x = x, y = y), col = "#388E3C") + theme_solarized()
)
#Create categorical methadone
addicts$catMeth <-
  cut(addicts$methadone,
      c(0, 60, 79, 200),
      labels = c("<60", "60-79", "80+"))

addicts$catMeth <- as.factor(addicts$catMeth)

#New cox model
addicts.coxCatMeth <-
  coxph(surv_object ~ clinic + prison + catMeth, data = addicts)


#We print AIC using drop1
# This is with methadone as quantitative
drop1(addicts.cox, test = "Chisq")
## Single term deletions
## 
## Model:
## surv_object ~ clinic + prison + methadone
##           Df    AIC     LRT  Pr(>Chi)    
## <none>       1352.5                      
## clinic     1 1376.9 26.3506 2.847e-07 ***
## prison     1 1354.3  3.7727   0.05209 .  
## methadone  1 1381.3 30.7820 2.887e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#This is with methadone as categorical 
drop1(addicts.coxCatMeth, test = "Chisq")
## Single term deletions
## 
## Model:
## surv_object ~ clinic + prison + catMeth
##         Df    AIC     LRT  Pr(>Chi)    
## <none>     1358.0                      
## clinic   1 1373.9 17.9570 2.259e-05 ***
## prison   1 1357.8  1.8475    0.1741    
## catMeth  2 1381.3 27.3215 1.167e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can see that the model with methadone as a quantitative has a better AIC than the one with methadone as categorical. Along with that, the quantitative methadone model has has a smaller p-value for the ChiSq test, providing greater statistically significant strentgh. We can conclude that replacing the quantitavie methadone with a 3-category factor does matter, as it hurts the strength of our model.

Question 4

# With Linear predictor
addicts.predict <- predict(addicts.cox)

#Deviance and df beta
addicts.dev <- residuals(addicts.cox, type = "deviance")
addicts.dfb <- residuals(addicts.cox, type = "dfbeta")

#MArtingale vs Linear Predictor
ggplotly(
  ggplot() + aes(
    x = addicts.predict,
    y = addicts.mart,
    label = rownames(addicts)
  ) + geom_point() + labs(x = "Linear Predictor", y = "Martingale Residual", title = "Martingale Residuals vs Linear Predictor") + theme_solarized()
)
# Deviance vs Linear Predictor
ggplotly(
  ggplot() + aes(
    x = addicts.predict,
    y = addicts.dev,
    label = rownames(addicts)
  ) + geom_point() + labs(x = "Linear Predictor", y = "Deviance Residual", title = "Deviance Residuals vs Linear Predictor") + theme_solarized()
)
# DfBeta vs Linear Predictor
# Clinic
ggplotly(
  ggplot() + aes(x = as.numeric(rownames(addicts)), y = addicts.dfb[, 1]) + geom_point() + labs(x = "Observation Number", y = "dfbeta for Clinic Type", title = "dfbeta Values for Observation Number by Clinic Type") + theme_solarized()
)
# Prison
ggplotly(
  ggplot() + aes(x = as.numeric(rownames(addicts)), y = addicts.dfb[, 2]) + geom_point() + labs(x = "Observation Number", y = "dfbeta for Prison Status", title = "dfbeta Values for Observation Number by Prison Status") + theme_solarized()
)
#Methadone
ggplotly(
  ggplot() + aes(x = as.numeric(rownames(addicts)), y = addicts.dfb[, 3]) + geom_point() + labs(x = "Observation Number", y = "dfbeta for Methadone Level", title = "dfbeta Values for Observation Number by Methadone Level") + theme_solarized()
)

Observations to Examine by Residuals and Influence

  • Martingale Residuals
    • 8, 9, 26
  • Deviance Residuals
    • 8, 26, 187
  • Clinic Influence
    • 111, 131, 136
  • Prison Influence
    • 8, 12, 26
  • Methadone Level Influence
    • 24, 70, 86
    From the residual plots, the most important pbservations to examine would be 8 and 26. Let’s observe these two observations:
addicts[c(8, 26),]
##    id  clinic status time prison methadone catMeth
## 8   8 Clinic1      0  796    Yes        60     <60
## 26 27 Clinic1      0  566    Yes        45     <60

Both of these observations were prisoners who were censored observations in our data. Given their long survival times, they could have both felt they no longer needed to be part of the study.